Change: Model is run multiple times with different train and test sets

library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(biomaRt)
library(Rtsne)
library(caret) ; library(ROCR) ; library(car)
library(corrplot)
library(expss) ; library(knitr)

# Return the SFARI colour palette hex codes at positions `r`
# (positions 1-6 are the SFARI score colours, 7-10 are grays used for
# 'None'/other categories).
# Fix: the original assigned the result to a local variable as its last
# expression, so the value was returned invisibly; return it directly.
SFARI_colour_hue = function(r) {
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Background


In 20_04_07_LR.html we performed a logistic regression but we discovered that the features were strongly correlated. This inflates the standard error of the coefficients, making them no longer interpretable.

# Clusterings
# Load module (cluster) assignments and flatten the selected clustering's
# column into a plain character vector
clustering_selected = 'DynamicHybridMergedSmall'
clusterings = read_csv('./../Data/clusters.csv')
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname

# Lookup table: gene ID -> assigned module (joined back in later sections)
assigned_module = data.frame('ID' = clusterings$ID, 'Module' = clusterings$Module)

# Original dataset
original_dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)

# Model dataset
# NOTE(review): assumed to load at least `dataset`, `fit` and `train_set`,
# which are used below -- confirm contents of the RData file
load('./../Data/LR_model.RData')

# Add gene symbol
# Map Ensembl gene IDs to gene symbols via the Feb 2014 Ensembl archive
# (presumably matching the genome build used upstream -- verify)
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)

Variance Inflation Factor (VIF) and correlation plot

# VIF
# Compute the Variance Inflation Factors once (previously car::vif(fit) was
# called twice) and flag features above the conventional VIF > 10 threshold
# for problematic multicollinearity
vif_vals = car::vif(fit) %>% sort
plot_data = data.frame('Feature' = names(vif_vals),
                       'VIF' = unname(vif_vals)) %>%
            mutate(outlier = VIF>10)

# Bar plot of VIF per feature on a log scale; dashed line marks VIF = 10
plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + scale_y_log10() +
              geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + xlab('Model Features') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

# Correlation plot
# Pairwise correlations between features (last column is the label, excluded)
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, tl.pos = 'l', tl.col = '#666666')

rm(getinfo, mart, clusterings, vif_vals)

Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but three of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which would be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics ref, but as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression


Ridge Regression


Notes:

### DEFINE FUNCTIONS

# Create positive training set including all SFARI scores
# Stratified sampling: draws a proportion `p` of the genes from each SFARI
# score (1-6), so the positive training set preserves the score distribution.
#
#   p:    proportion of genes of each score to include in the training set
#   seed: RNG seed, so each model iteration gets a different split
#
# Returns an integer vector of row indices into `dataset`.
# Relies on the globals `dataset` and `original_dataset`.
# Fix: the original grew the index vector with c() inside a loop; build the
# per-score samples with lapply and flatten once instead. Sampling still
# happens in score order 1..6, so the RNG stream (and therefore the selected
# indices for any given seed) is identical to the original implementation.
positive_sample_balancing_SFARI_scores = function(p, seed){
  
  set.seed(seed)
  
  idx_by_score = lapply(1:6, function(score){
    # Genes of this score that are present in the model dataset
    score_genes = rownames(original_dataset)[rownames(original_dataset) %in% rownames(dataset) & original_dataset$gene.score == score]
    score_idx = which(rownames(dataset) %in% score_genes)
    sample(score_idx, size = ceiling(p*length(score_idx)))
  })
  
  return(unlist(idx_by_score))
}

# Build a balanced train set and the complementary test set.
#   p:                  proportion of SFARI genes of each score used for training
#   over_sampling_fold: how many extra positives (with replacement) to add
#   seed:               RNG seed forwarded to the stratified positive sampling
# Returns list(train_set, test_set). Relies on the global `dataset`.
# NOTE(review): the sample() calls below depend on the RNG state set inside
# positive_sample_balancing_SFARI_scores -- do not reorder these statements.
create_train_test_sets = function(p, over_sampling_fold, seed){
  
  ### CREATE POSITIVE TRAINING SET (balancing SFARI scores and over-sampling)
  # Stratified positive sample, then over-sample with replacement to counter
  # the strong class imbalance
  positive_train_idx = positive_sample_balancing_SFARI_scores(p, seed)
  add_obs = sample(positive_train_idx, size = ceiling(over_sampling_fold*length(positive_train_idx)), replace=TRUE)
  positive_train_idx = c(positive_train_idx, add_obs)
  
  
  ### CREATE NEGATIVE TRAINING SET
  # As many negatives (non-SFARI genes) as over-sampled positives -> balanced
  negative_idx = which(!dataset$SFARI)
  negative_train_idx = sample(negative_idx, size = length(positive_train_idx))
  
  
  ### CREATE TRAIN AND TEST SET
  # Duplicated indices are kept in the train set (that IS the over-sampling);
  # the test set is every row not used for training (duplicates removed first)
  train_set = dataset[sort(c(positive_train_idx, negative_train_idx)),]
  test_set = dataset[-unique(c(positive_train_idx, negative_train_idx)),]
  
  return(list('train_set' = train_set, 'test_set' = test_set))
  
}

# Train one ridge-regression model on a fresh train/test split and evaluate it.
#   p, over_sampling_fold, seed: forwarded to create_train_test_sets()
# Returns list(acc, AUC, preds, coefs).
run_model = function(p, over_sampling_fold, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, over_sampling_fold, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model
  # Ridge regression (glmnet, alpha = 0); lambda selected by 10-fold CV over a
  # log-spaced grid. Seed fixed so CV folds are reproducible across iterations
  train_set$SFARI = train_set$SFARI %>% as.factor
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(123)
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = trainControl('cv', number = 10),
                tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set
  # `prob` is P(SFARI = TRUE); classify with a 0.5 threshold
  predictions = fit %>% predict(test_set, type='prob')
  preds = data.frame('ID'=rownames(test_set), 'prob'=predictions$`TRUE`) %>% mutate(pred = prob>0.5)

  # Measure performance of the model
  acc = mean(test_set$SFARI==preds$pred)
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features
  # Coefficients of the final model at the CV-selected lambda (intercept first)
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'AUC' = AUC, 'preds' = preds, 'coefs' = coefs))
}


### RUN MODEL

# Parameters
p = 0.8                     # proportion of each SFARI score used for training
over_sampling_fold = 3      # positives over-sampled 3-fold
n_iter = 100                # number of train/test splits
seeds = 123:(123+n_iter-1)  # one seed per iteration

# Store outputs
# `predictions` accumulates, per gene: summed probability, count of positive
# predictions, and number of test sets the gene appeared in (`n`)
acc = c()
AUC = c()
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(seed in seeds){
  
  # Run model
  model_output = run_model(p, over_sampling_fold, seed)
  
  # Update outputs
  acc = c(acc, model_output[['acc']])
  AUC = c(AUC, model_output[['AUC']])
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  # A gene lands in different test sets across iterations, so accumulate sums
  # and a per-gene appearance count `n` to average over afterwards
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] + update_preds
}

# Average accumulated quantities: mean coefficient across iterations, mean
# probability per gene, and the final label from the mean probability
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)

rm(p, over_sampling_fold, seeds, update_preds, positive_sample_balancing_SFARI_scores, create_train_test_sets, run_model)


To summarise in a single value the predictions of the models:

# Keep only genes that appeared in at least one test set, and attach their
# Gene Significance (GS) and Module-Diagnosis correlation (MTcor)
test_set = predictions %>% filter(n>0) %>% left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]


Performance metrics


Confusion matrix

# Label columns for the expss cross-tabulation, then print the confusion
# matrix of actual vs predicted SFARI labels
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  9730 5407   15137
   TRUE  358 539   897
   #Total cases  10088 5946   16034
rm(conf_mat)

Accuracy

# Report mean and SD of accuracy over the n_iter train/test splits
cat(paste0('ACCURACY: mean = ', round(mean(acc),4), ' SD = ', round(sd(acc),4)))
## ACCURACY: mean = 0.6396 SD = 0.0069


ROC Curve

# Report mean and SD of per-iteration AUC (AUC is rm'd further below)
cat(paste0('AUC:      mean = ', mean(AUC), ' SD = ', sd(AUC)))
## AUC:      mean = 0.670549396655147 SD = 0.0192981599597913
pred_ROCR = prediction(test_set$prob, test_set$SFARI)

# ROC curve and AUC computed on the averaged per-gene probabilities
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')  # diagonal reference = random classifier

Lift Curve

# Lift curve: lift vs rate of positive predictions, from the same ROCR object
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


MTcor has a very small coefficient and Gene Significance has a negative coefficient (!)

# Per-gene module info: Module-Diagnosis correlation and SFARI membership,
# with the leading '#' stripped from the module colour codes so they match
# the coefficient feature names
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% left_join(assigned_module, by ='ID') %>%
                 mutate(Module = gsub('#','',Module))

# Join each coefficient (module-membership features are named 'MM.<module>')
# to its module's MTcor and the fraction of SFARI genes in that module
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% summarise('SFARI_perc' = mean(SFARI)) %>%
            arrange(desc(coef))

kable(coef_info %>% dplyr::select(feature, coef) %>% rename('Feature' = feature, 'Coefficient' = coef),
      align = 'cc', caption = 'Regression Coefficients')
Regression Coefficients
Feature Coefficient
FF62BC 1.0193924
00A7FF 0.6409614
FE6E8A 0.5537652
00BF7D 0.5186754
00BA38 0.5089766
00BFC4 0.4421878
D376FF 0.3966287
00BCD8 0.3877020
00C0AF 0.3398744
D89000 0.2897373
00C097 0.1894138
39B600 0.1828604
F8766D 0.1794108
EF7F49 0.1381873
6BB100 0.0798353
F564E3 0.0697580
E58700 0.0664270
absGS 0.0349896
B79F00 0.0225067
FF67A4 0.0067750
MTcor -0.0031499
00B0F6 -0.0149717
A3A500 -0.0681002
GS -0.1314690
E76BF3 -0.1357313
B983FF -0.1912169
Intercept -0.2056533
FD61D1 -0.2101999
00BD5F -0.2324720
8AAB00 -0.2491964
C99800 -0.2767617
9590FF -0.4379413
00B7E9 -0.5280243
619CFF -0.6140371

  • There is a positive relation between the coefficient assigned to the membership of each module and the percentage of SFARI genes that are assigned to that module
# Coefficient vs % of SFARI genes per module; each point coloured with its
# module's own hex colour, lm trend line in gray
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, SFARI_perc)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('% of SFARI genes in Module'))


  • There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

# Coefficient vs Module-Diagnosis correlation per module; points coloured with
# the module's own hex colour, lm trend line in gray
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)])) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse model


SFARI genes have a slightly higher score distribution than the rest

plot_data = test_set %>% dplyr::select(prob, SFARI)

# Density of model scores split by SFARI label; dashed vertical lines mark
# each group's mean score
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))
  • There seems to be a small but consistent positive relation between the SFARI scores and the probability assigned by the model
# Attach each gene's SFARI score (1-6 or None) to its averaged probability,
# then tabulate the score counts
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  25
   2  64
   3  191
   4  432
   5  163
   6  22
   None  15137
   #Total cases  16034
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

# Boxplots of probability by SFARI score, coloured with the SFARI palette
# (positions 1-6 then the two grays, reordered as 8,7)
ggplotly(plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal())
rm(mean_vals)

Genes with highest scores in test set


  • Considering the class imbalance in the test set (1:69), there are many more SFARI scores in here (1:3)

  • 3 of the 5 SFARI genes with a score of 1 in the test set are in this top 50 list

  • There aren’t any SFARI genes with a score of 6

# Top 50 genes by averaged model probability, annotated with gene symbol,
# module, Gene Significance, Module-Diagnosis correlation and SFARI score
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>% 
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' = MTcor, 'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4), 
                    GeneSignificance = round(GeneSignificance,4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability, gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set')
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
RPRD2 -0.0922 -0.6031 #00BA38 0.8433 None
PRDM2 -0.2518 0.0586 #FE6E8A 0.8393 None
TANC2 -0.2348 -0.6031 #00BA38 0.8367 3
EP300 -0.0234 0.1127 #FF62BC 0.8346 4
MYCBP2 -0.3975 -0.6031 #00BA38 0.8316 None
ARID1B 0.2711 0.1127 #FF62BC 0.8281 1
ANK2 -0.4368 -0.9514 #00C0AF 0.8258 1
SRCAP -0.3623 -0.6031 #00BA38 0.8250 2
RFX7 0.1372 0.0586 #FE6E8A 0.8243 None
NFAT5 0.1035 0.0586 #FE6E8A 0.8232 None
KMT2A 0.1347 0.7916 #00C097 0.8203 1
FMNL1 -0.2223 -0.6031 #00BA38 0.8181 None
HUNK -0.3273 -0.6750 #D376FF 0.8175 None
MBD5 0.1943 0.0586 #FE6E8A 0.8173 3
RNF111 -0.2410 0.0586 #FE6E8A 0.8170 None
TLE3 0.4884 0.1127 #FF62BC 0.8166 None
SRRM4 -0.4400 -0.6031 #00BA38 0.8159 5
RAPH1 -0.0534 -0.6031 #00BA38 0.8138 None
KMT2D -0.3255 -0.6031 #00BA38 0.8127 None
C20orf112 0.0314 0.1127 #FF62BC 0.8089 None
ANK3 -0.4814 0.0586 #FE6E8A 0.8078 3
HECTD4 -0.3170 -0.6031 #00BA38 0.8076 3
EP400 -0.1671 -0.6031 #00BA38 0.8054 3
CREBBP 0.1431 0.1127 #FF62BC 0.8050 5
KCNJ6 -0.1379 -0.9514 #00C0AF 0.8043 None
TP53INP2 -0.2140 -0.4946 #B79F00 0.8041 None
DROSHA -0.4111 -0.6031 #00BA38 0.8040 None
ASAP1 -0.0896 -0.6031 #00BA38 0.8032 None
DLGAP4 -0.3307 -0.6031 #00BA38 0.8022 5
ETV6 -0.1418 -0.6031 #00BA38 0.8011 None
CACNG3 -0.4689 -0.6031 #00BA38 0.8008 None
GATAD2B -0.4221 -0.6031 #00BA38 0.8008 None
ATN1 -0.2052 -0.6031 #00BA38 0.7982 None
RASGRF1 -0.7425 -0.9514 #00C0AF 0.7981 None
DAAM1 0.1579 -0.0094 #00A7FF 0.7978 None
NRXN3 -0.6716 -0.9514 #00C0AF 0.7977 3
RIMBP2 0.2615 -0.0094 #00A7FF 0.7969 None
HUWE1 -0.5026 -0.6031 #00BA38 0.7968 None
NFIX -0.4208 -0.6031 #00BA38 0.7966 None
TLN2 -0.4783 -0.9514 #00C0AF 0.7962 None
CELF2 -0.0605 -0.9514 #00C0AF 0.7951 None
GNAS -0.5865 -0.6031 #00BA38 0.7949 4
POM121C -0.2791 -0.6031 #00BA38 0.7945 None
SETD1B -0.1561 -0.6031 #00BA38 0.7939 4
ZNF804A -0.2768 -0.6750 #D376FF 0.7935 3
SAP130 -0.2482 -0.6031 #00BA38 0.7922 None
ARID1A -0.1378 -0.6031 #00BA38 0.7920 None
HIVEP2 0.0165 -0.9514 #00C0AF 0.7919 None
NBEA -0.4147 -0.9514 #00C0AF 0.7915 4
CLIP3 -0.5368 -0.6031 #00BA38 0.7906 None




Negative samples distribution


Selecting the Negative samples in the test set

# Restrict to the negative (non-SFARI) genes in the test set
negative_set = test_set %>% filter(!SFARI)

# Tabulate how many negatives the model predicts as positive
negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability', 
                                                   pred = 'Label Prediction')
cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  9730
   TRUE  5407
   #Total cases  15137
# Count of negative genes predicted as ASD-related (mean prob > 0.5)
cat(paste0('\n', sum(negative_set$pred), ' genes are predicted as ASD-related'))
## 
## 5407 genes are predicted as ASD-related
# Probability distribution of the negative genes; dotted line marks the 0.5
# classification threshold
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
                 geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + 
                 ggtitle('Probability distribution of the Negative samples in the test set') + 
                 theme_minimal()


Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a negative relation with the Gene Significance (under-expressed genes having on average the higher probabilities and over-expressed genes the lowest)

  • The pattern is strongest in under-expressed genes

# Probability vs Gene Significance for negative genes, coloured by the
# Module-Diagnosis correlation; loess trend line in dark gray
negative_set %>% ggplot(aes(prob, GS, color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
                 geom_hline(yintercept=0, color='gray', linetype='dashed') + xlab('Probability') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between Probability and Gene Significance') + theme_minimal()


Probability and Module-Diagnosis correlation


On average, the model seems to be assigning a probability inversely proportional to the Module-Diagnosis correlation of the module, with the highest positively correlated modules having the lowest average probability and the highest negatively correlated modules the highest average probability. But the difference isn’t big

# Module-Diagnosis correlation vs probability for negative genes, coloured by
# Gene Significance; dashed line marks the overall mean probability
negative_set %>% ggplot(aes(MTcor, prob, color=GS)) + geom_point() + geom_smooth(method='loess', color='#666666') + 
                 geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) + 
                 xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
                 theme_minimal()

Summarised version, plotting by module instead of by gene

The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

The model seems to give higher probabilities to genes belonging to modules with a small (absolute) correlation to Diagnosis (this is unexpected)

# Summarise per module, grouping by MTcor (presumably unique per module --
# verify), then join back to recover the module colour/ID
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'

# Mean probability per module vs its Module-Diagnosis correlation; point size
# reflects module size; loess (gray) and lm trend lines
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID)) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Model') + theme_minimal())


Probability and level of expression


There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias

# Gandal dataset
# NOTE(review): assumed to load at least `datExpr` (expression matrix, genes
# in rows) and `DE_info` (differential expression results) -- confirm
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# Per-gene mean and SD of expression across samples
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

# Attach expression summaries and module assignment to the negative genes
plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'

# Mean expression vs probability per gene, with loess and lm trend lines
plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.2, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              theme_minimal() + ggtitle('Mean expression vs model probability by gene')

rm(mean_and_sd)
# Same relation summarised at the module level, points coloured by module colour
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n=n())

ggplotly(plot_data2 %>% ggplot(aes(meanExpr, meanProb, size=n)) + geom_point(color=plot_data2$Module) + 
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + 
         geom_smooth(method='lm', se=FALSE, color='gray') + theme_minimal() + theme(legend.position='none') + 
         ggtitle('Mean expression vs model probability by Module'))
rm(plot_data2)


Probability and lfc


There is a relation between probability and lfc, so the model **is** capturing a bit of true information (because lfc and mean expression were negatively correlated, and lfc still has a positive relation with the model's probability)

  • The relation is stronger in genes under-expressed in ASD
# Attach the differential expression results (log2 fold change, padj)
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

# log2 fold change vs probability per gene, with a loess trend line
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              theme_minimal() + ggtitle('lfc vs model probability by gene')

  • The relation is stronger in Differentially Expressed genes
# Side-by-side panels: under-expressed (lfc < 0) and over-expressed (lfc >= 0)
# genes, coloured by differential expression status (padj < 0.05); both panels
# share the same y-axis limits so they are directly comparable
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + 
                   ylim(c(min(plot_data$prob), max(plot_data$prob))) + 
                   theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + ylab('') +
                   scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
                   theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2)



Conclusion


The model is capturing the mean level of expression of the genes (indirectly through module membership), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)


Saving results

# Overwrite `predictions` with the annotated test-set summary before saving,
# so downstream scripts load the averaged per-gene results
predictions = test_set

save(predictions, dataset, file='./../Data/Ridge_model_robust.RData')




Session info

# Record R and package versions for reproducibility
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] expss_0.10.2       corrplot_0.84      car_3.0-7          carData_3.0-3     
##  [5] ROCR_1.0-7         gplots_3.0.3       caret_6.0-86       lattice_0.20-41   
##  [9] Rtsne_0.15         biomaRt_2.40.5     RColorBrewer_1.1-2 gridExtra_2.3     
## [13] viridis_0.5.1      viridisLite_0.3.0  plotly_4.9.2       knitr_1.28        
## [17] forcats_0.5.0      stringr_1.4.0      dplyr_0.8.5        purrr_0.3.3       
## [21] readr_1.3.1        tidyr_1.0.2        tibble_3.0.0       ggplot2_3.3.0     
## [25] tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.4-0                 plyr_1.8.6                 
##   [5] lazyeval_0.2.2              splines_3.6.3              
##   [7] BiocParallel_1.18.1         crosstalk_1.1.0.1          
##   [9] GenomeInfoDb_1.20.0         digest_0.6.25              
##  [11] foreach_1.5.0               htmltools_0.4.0            
##  [13] gdata_2.18.0                fansi_0.4.1                
##  [15] magrittr_1.5                checkmate_2.0.0            
##  [17] memoise_1.1.0               cluster_2.1.0              
##  [19] openxlsx_4.1.4              annotate_1.62.0            
##  [21] recipes_0.1.10              modelr_0.1.6               
##  [23] gower_0.2.1                 matrixStats_0.56.0         
##  [25] prettyunits_1.1.1           jpeg_0.1-8.1               
##  [27] colorspace_1.4-1            blob_1.2.1                 
##  [29] rvest_0.3.5                 haven_2.2.0                
##  [31] xfun_0.12                   crayon_1.3.4               
##  [33] RCurl_1.98-1.1              jsonlite_1.6.1             
##  [35] genefilter_1.66.0           survival_3.1-11            
##  [37] iterators_1.0.12            glue_1.3.2                 
##  [39] gtable_0.3.0                zlibbioc_1.30.0            
##  [41] ipred_0.9-9                 XVector_0.24.0             
##  [43] DelayedArray_0.10.0         shape_1.4.4                
##  [45] BiocGenerics_0.30.0         abind_1.4-5                
##  [47] scales_1.1.0                DBI_1.1.0                  
##  [49] Rcpp_1.0.4                  xtable_1.8-4               
##  [51] progress_1.2.2              htmlTable_1.13.3           
##  [53] foreign_0.8-75              bit_1.1-15.2               
##  [55] Formula_1.2-3               stats4_3.6.3               
##  [57] lava_1.6.7                  prodlim_2019.11.13         
##  [59] glmnet_3.0-2                htmlwidgets_1.5.1          
##  [61] httr_1.4.1                  acepack_1.4.1              
##  [63] ellipsis_0.3.0              pkgconfig_2.0.3            
##  [65] XML_3.99-0.3                farver_2.0.3               
##  [67] nnet_7.3-13                 dbplyr_1.4.2               
##  [69] locfit_1.5-9.4              tidyselect_1.0.0           
##  [71] labeling_0.3                rlang_0.4.5                
##  [73] reshape2_1.4.3              AnnotationDbi_1.46.1       
##  [75] munsell_0.5.0               cellranger_1.1.0           
##  [77] tools_3.6.3                 cli_2.0.2                  
##  [79] generics_0.0.2              RSQLite_2.2.0              
##  [81] broom_0.5.5                 evaluate_0.14              
##  [83] yaml_2.2.1                  ModelMetrics_1.2.2.2       
##  [85] bit64_0.9-7                 fs_1.4.0                   
##  [87] zip_2.0.4                   caTools_1.18.0             
##  [89] nlme_3.1-147                xml2_1.2.5                 
##  [91] compiler_3.6.3              rstudioapi_0.11            
##  [93] png_0.1-7                   curl_4.3                   
##  [95] e1071_1.7-3                 reprex_0.3.0               
##  [97] geneplotter_1.62.0          stringi_1.4.6              
##  [99] highr_0.8                   Matrix_1.2-18              
## [101] vctrs_0.2.4                 pillar_1.4.3               
## [103] lifecycle_0.2.0             data.table_1.12.8          
## [105] bitops_1.0-6                GenomicRanges_1.36.1       
## [107] latticeExtra_0.6-29         R6_2.4.1                   
## [109] KernSmooth_2.23-16          rio_0.5.16                 
## [111] IRanges_2.18.3              codetools_0.2-16           
## [113] MASS_7.3-51.5               gtools_3.8.2               
## [115] assertthat_0.2.1            SummarizedExperiment_1.14.1
## [117] DESeq2_1.24.0               withr_2.1.2                
## [119] S4Vectors_0.22.1            GenomeInfoDbData_1.2.1     
## [121] mgcv_1.8-31                 parallel_3.6.3             
## [123] hms_0.5.3                   grid_3.6.3                 
## [125] rpart_4.1-15                timeDate_3043.102          
## [127] class_7.3-16                rmarkdown_2.1              
## [129] pROC_1.16.2                 base64enc_0.1-3            
## [131] Biobase_2.44.0              lubridate_1.7.4